# importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Setup
# common:
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import matplotlib.patches as mpatches
from scipy.stats import norm
from scipy import stats
import time
import folium
import collections
import eli5 # Feature importance evaluation
import urllib
from PIL import Image
import warnings
warnings.filterwarnings('ignore')
#读取数据集
df = pd.read_csv('hotel_bookings.csv')
df.head()
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | ... | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | ... | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | ... | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | ... | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
5 rows × 32 columns
#数据基本信息描述
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119390 entries, 0 to 119389 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 hotel 119390 non-null object 1 is_canceled 119390 non-null int64 2 lead_time 119390 non-null int64 3 arrival_date_year 119390 non-null int64 4 arrival_date_month 119390 non-null object 5 arrival_date_week_number 119390 non-null int64 6 arrival_date_day_of_month 119390 non-null int64 7 stays_in_weekend_nights 119390 non-null int64 8 stays_in_week_nights 119390 non-null int64 9 adults 119390 non-null int64 10 children 119386 non-null float64 11 babies 119390 non-null int64 12 meal 119390 non-null object 13 country 118902 non-null object 14 market_segment 119390 non-null object 15 distribution_channel 119390 non-null object 16 is_repeated_guest 119390 non-null int64 17 previous_cancellations 119390 non-null int64 18 previous_bookings_not_canceled 119390 non-null int64 19 reserved_room_type 119390 non-null object 20 assigned_room_type 119390 non-null object 21 booking_changes 119390 non-null int64 22 deposit_type 119390 non-null object 23 agent 103050 non-null float64 24 company 6797 non-null float64 25 days_in_waiting_list 119390 non-null int64 26 customer_type 119390 non-null object 27 adr 119390 non-null float64 28 required_car_parking_spaces 119390 non-null int64 29 total_of_special_requests 119390 non-null int64 30 reservation_status 119390 non-null object 31 reservation_status_date 119390 non-null object dtypes: float64(4), int64(16), object(12) memory usage: 29.1+ MB
#数据五数概括
df.describe()
| is_canceled | lead_time | arrival_date_year | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | booking_changes | agent | company | days_in_waiting_list | adr | required_car_parking_spaces | total_of_special_requests | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119386.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 103050.000000 | 6797.000000 | 119390.000000 | 119390.000000 | 119390.000000 | 119390.000000 |
| mean | 0.370416 | 104.011416 | 2016.156554 | 27.165173 | 15.798241 | 0.927599 | 2.500302 | 1.856403 | 0.103890 | 0.007949 | 0.031912 | 0.087118 | 0.137097 | 0.221124 | 86.693382 | 189.266735 | 2.321149 | 101.831122 | 0.062518 | 0.571363 |
| std | 0.482918 | 106.863097 | 0.707476 | 13.605138 | 8.780829 | 0.998613 | 1.908286 | 0.579261 | 0.398561 | 0.097436 | 0.175767 | 0.844336 | 1.497437 | 0.652306 | 110.774548 | 131.655015 | 17.594721 | 50.535790 | 0.245291 | 0.792798 |
| min | 0.000000 | 0.000000 | 2015.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 6.000000 | 0.000000 | -6.380000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 18.000000 | 2016.000000 | 16.000000 | 8.000000 | 0.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 62.000000 | 0.000000 | 69.290000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 69.000000 | 2016.000000 | 28.000000 | 16.000000 | 1.000000 | 2.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 14.000000 | 179.000000 | 0.000000 | 94.575000 | 0.000000 | 0.000000 |
| 75% | 1.000000 | 160.000000 | 2017.000000 | 38.000000 | 23.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 229.000000 | 270.000000 | 0.000000 | 126.000000 | 0.000000 | 1.000000 |
| max | 1.000000 | 737.000000 | 2017.000000 | 53.000000 | 31.000000 | 19.000000 | 50.000000 | 55.000000 | 10.000000 | 10.000000 | 1.000000 | 26.000000 | 72.000000 | 21.000000 | 535.000000 | 543.000000 | 391.000000 | 5400.000000 | 8.000000 | 5.000000 |
#数据缺失值展示
df.isnull().sum()
hotel 0 is_canceled 0 lead_time 0 arrival_date_year 0 arrival_date_month 0 arrival_date_week_number 0 arrival_date_day_of_month 0 stays_in_weekend_nights 0 stays_in_week_nights 0 adults 0 children 4 babies 0 meal 0 country 488 market_segment 0 distribution_channel 0 is_repeated_guest 0 previous_cancellations 0 previous_bookings_not_canceled 0 reserved_room_type 0 assigned_room_type 0 booking_changes 0 deposit_type 0 agent 16340 company 112593 days_in_waiting_list 0 customer_type 0 adr 0 required_car_parking_spaces 0 total_of_special_requests 0 reservation_status 0 reservation_status_date 0 dtype: int64
#数据缺失值处理
df.drop('company', axis=1, inplace=True)
df["meal"].replace("Undefined", "SC", inplace=True)
zero_guests = list(df.loc[df["adults"]
+ df["children"]
+ df["babies"]==0].index)
df.drop(df.index[zero_guests], inplace=True)
# df.drop(['country',
# 'arrival_date_year',
# 'arrival_date_week_number',
# 'arrival_date_day_of_month',
# 'stays_in_weekend_nights',
# 'stays_in_week_nights',
# 'days_in_waiting_list',
# 'required_car_parking_spaces',
# 'reservation_status'
# ],
# axis=1, inplace=True)
#将数值属性与类别属性分开
categorical_features = []
numerical_features = []
for column in df.columns:
if df[column].dtype == object:
categorical_features.append(column)
else:
numerical_features.append(column)
#绘制数值属性直方图
df.hist(bins=50, figsize=(20,15))
plt.tight_layout()
plt.show()
#入住顾客来自不同国家的的数量和比例
country_data = pd.DataFrame(df.loc[df["is_canceled"] == 0]["country"].value_counts())
country_data.rename(columns = {'country': 'guests number'}, inplace = True)
country_data['country'] = country_data.index
country_data['percent'] = country_data['guests number'] / country_data['guests number'].sum()
country_data.head()
| guests number | country | percent | |
|---|---|---|---|
| PRT | 20977 | PRT | 0.281231 |
| GBR | 9668 | GBR | 0.129615 |
| FRA | 8468 | FRA | 0.113527 |
| ESP | 6383 | ESP | 0.085574 |
| DEU | 6067 | DEU | 0.081338 |
#可视化展示
fig = plt.figure(figsize = (16,9))
ax = fig.add_subplot(121)
country_data_plot = country_data.iloc[:10, :]
font = {'weight': 'normal', 'size': 20}
ax.set_xlabel('Country', font)
ax.set_ylabel('Number', font)
ax.bar(country_data_plot['country'], country_data_plot['guests number'])
ax = fig.add_subplot(122)
country_pie_data = country_data.iloc[:10, :]
ax.pie(x = country_pie_data['guests number'], labels = country_pie_data['country'], autopct = '%.3f%%')
fig.suptitle('The Number of Guests From Different Countries',fontsize = 20)
ax.legend()
<matplotlib.legend.Legend at 0x7f0a16e4d640>
basemap = folium.Map()
guests_map = px.choropleth(country_data, locations = country_data['country'],
color = country_data['guests number'], hover_name = country_data['country'])
guests_map.show()
#统计城市酒店和节日酒店每个月份的入住游客数量作为酒店的预定需求
data_resort = df[(df['hotel'] == 'Resort Hotel') & (df['is_canceled'] == 0)]
data_city = df[(df['hotel'] == 'City Hotel') & (df['is_canceled'] == 0)]
resort_guests = data_resort['arrival_date_month'].value_counts().reset_index()
resort_guests.columns=['month','num of guests']
city_guests = data_city['arrival_date_month'].value_counts().reset_index()
city_guests.columns=['month','num of guests']
final_guests = resort_guests.merge(city_guests,on='month')
final_guests.columns=['month','num of guests in resort','num of guest in city hotel']
final_guests
| month | num of guests in resort | num of guest in city hotel | |
|---|---|---|---|
| 0 | August | 3257 | 5367 |
| 1 | July | 3137 | 4770 |
| 2 | October | 2575 | 4326 |
| 3 | March | 2571 | 4049 |
| 4 | April | 2550 | 4010 |
| 5 | May | 2535 | 4568 |
| 6 | February | 2308 | 3051 |
| 7 | September | 2102 | 4283 |
| 8 | June | 2037 | 4358 |
| 9 | December | 2014 | 2377 |
| 10 | November | 1975 | 2676 |
| 11 | January | 1866 | 2249 |
!pip install sort-dataframeby-monthorweek
!pip install sorted-months-weekdays
Requirement already satisfied: sort-dataframeby-monthorweek in /home/miaozzz/anaconda3/envs/tf-gpu/lib/python3.8/site-packages (0.4) Requirement already satisfied: sorted-months-weekdays in /home/miaozzz/anaconda3/envs/tf-gpu/lib/python3.8/site-packages (0.2)
import sort_dataframeby_monthorweek as sd
def sort_month(df, column_name):
return sd.Sort_Dataframeby_Month(df, column_name)
final_guests = sort_month(final_guests,'month')
px.line(final_guests, x = 'month', y = ['num of guests in resort','num of guest in city hotel'],
title='每月入住游客的总数')
resort_dataset = df[(df['hotel'] == 'Resort Hotel')]
city_dataset = df[(df['hotel'] == 'City Hotel')]
print('总体入住率为', round(df['is_canceled'].value_counts()[0]/len(df) * 100,2),'%')
print('度假酒店的入住率为:',round(resort_dataset['is_canceled'].value_counts()[0]/len(resort_dataset) * 100,2),'%')
print('城市酒店的入住率为:',round(city_dataset['is_canceled'].value_counts()[0]/len(city_dataset) * 100,2),'%')
总体入住率为 62.92 % 度假酒店的入住率为: 72.23 % 城市酒店的入住率为: 58.21 %
#类别属性入住率对比
plt.figure(figsize=(20, 20))
for i, feature in enumerate(categorical_features, 1):
plt.subplot(4, 4, i)
df[df["is_canceled"] == 0][feature].hist(bins=35, color='blue', label='Not Cancelation', alpha=0.6)
df[df["is_canceled"] == 1][feature].hist(bins=35, color='red', label='Cancelation', alpha=0.6)
plt.legend()
plt.xlabel(feature)
plt.ylabel('count')
plt.figure(figsize=(20, 20))
for i, feature in enumerate(numerical_features, 1):
plt.subplot(5, 4, i)
df[df["is_canceled"] == 0][feature].hist(bins=35, color='blue', label='Not Cancelation', alpha=0.6)
df[df["is_canceled"] == 1][feature].hist(bins=35, color='red', label='Cancelation', alpha=0.6)
plt.legend()
plt.xlabel(feature)
plt.ylabel('count')
filter = df['is_canceled'] == 0
data = df[filter]
data['total_nights'] = data['stays_in_weekend_nights'] + data['stays_in_week_nights']
stay = data.groupby(['total_nights', 'hotel']).agg('count').reset_index()
stay = stay.iloc[:, :3]
stay = stay.rename(columns={'is_canceled':'Number of stays'})
stay
| total_nights | hotel | Number of stays | |
|---|---|---|---|
| 0 | 0 | City Hotel | 251 |
| 1 | 0 | Resort Hotel | 371 |
| 2 | 1 | City Hotel | 9155 |
| 3 | 1 | Resort Hotel | 6579 |
| 4 | 2 | City Hotel | 10983 |
| ... | ... | ... | ... |
| 57 | 46 | Resort Hotel | 1 |
| 58 | 48 | City Hotel | 1 |
| 59 | 56 | Resort Hotel | 1 |
| 60 | 60 | Resort Hotel | 1 |
| 61 | 69 | Resort Hotel | 1 |
62 rows × 3 columns
px.bar(data_frame = stay, x = 'total_nights', y = 'Number of stays', color = 'hotel', barmode = 'group')
df['meal'].value_counts()/len(df['meal'])*100
BB 77.372704 HB 12.128177 SC 9.829712 FB 0.669407 Name: meal, dtype: float64
filter = df['is_canceled'] == 0
data = df[filter]
meal = data.groupby(['meal', 'hotel']).agg('count').reset_index()
meal = meal.iloc[:, :3]
meal = meal.rename(columns={'is_canceled':'Numbers'})
meal
| meal | hotel | Numbers | |
|---|---|---|---|
| 0 | BB | City Hotel | 35570 |
| 1 | BB | Resort Hotel | 22160 |
| 2 | FB | City Hotel | 9 |
| 3 | FB | Resort Hotel | 311 |
| 4 | HB | City Hotel | 3976 |
| 5 | HB | Resort Hotel | 5499 |
| 6 | SC | City Hotel | 6529 |
| 7 | SC | Resort Hotel | 957 |
px.bar(data_frame = meal, x = 'meal', y = 'Numbers', color = 'hotel', barmode = 'group')
#lead time
filter = df['is_canceled'] == 0
data = df[filter]
lead_time = data.groupby(['lead_time', 'hotel']).agg('count').reset_index()
lead_time = lead_time.iloc[:, :3]
lead_time = lead_time.rename(columns={'is_canceled':'Numbers'})
lead_time
| lead_time | hotel | Numbers | |
|---|---|---|---|
| 0 | 0 | City Hotel | 2769 |
| 1 | 0 | Resort Hotel | 3078 |
| 2 | 1 | City Hotel | 1622 |
| 3 | 1 | Resort Hotel | 1505 |
| 4 | 2 | City Hotel | 999 |
| ... | ... | ... | ... |
| 777 | 518 | City Hotel | 22 |
| 778 | 532 | Resort Hotel | 1 |
| 779 | 542 | Resort Hotel | 23 |
| 780 | 709 | Resort Hotel | 1 |
| 781 | 737 | Resort Hotel | 1 |
782 rows × 3 columns
px.bar(data_frame = lead_time, x = 'lead_time', y = 'Numbers', color = 'hotel', barmode = 'group')
# normalize price per night (adr):
df["adr_pp"] = df["adr"] / (df["adults"] + df["children"] + df["babies"])
full_data_guests = df.loc[df["is_canceled"] == 0] # only actual gusts
room_prices = full_data_guests[["hotel", "arrival_date_month", "adr_pp"]].sort_values("arrival_date_month")
room_prices = sort_month(room_prices,'arrival_date_month')
# boxplot:
plt.figure(figsize=(12, 8))
sns.boxplot(x="arrival_date_month",
y="adr_pp",
hue="hotel",
data=room_prices,
hue_order=["City Hotel", "Resort Hotel"],
fliersize=0)
plt.title("Price per night and person", fontsize=16)
plt.xlabel("Month", fontsize=16)
plt.ylabel("Price [EUR]", fontsize=16)
plt.legend(loc="upper right")
plt.ylim(0, 165)
plt.show()
就每晚平均价格而言City Hotel的最佳预定时间为每年冬季大概为10月份-3月份。Resort Hotel最佳预定时间为7、8月份预计1-3月份。
# Import label encoder
from sklearn import preprocessing
data = pd.read_csv("hotel_bookings.csv")
#删除空值
data = data.drop(['company'], axis = 1)
data = data.dropna(axis = 0)
#将类别属性进行编码
data['hotel'] = data['hotel'].map({'Resort Hotel':0, 'City Hotel':1})
data['arrival_date_month'] = data['arrival_date_month'].map({'January':1, 'February': 2, 'March':3, 'April':4, 'May':5, 'June':6, 'July':7,
'August':8, 'September':9, 'October':10, 'November':11, 'December':12})
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
# Encode labels in column.
data['customer_type']= label_encoder.fit_transform(data['customer_type'])
data['assigned_room_type'] = label_encoder.fit_transform(data['assigned_room_type'])
data['deposit_type'] = label_encoder.fit_transform(data['deposit_type'])
data['reservation_status'] = label_encoder.fit_transform(data['reservation_status'])
data['meal'] = label_encoder.fit_transform(data['meal'])
data['country'] = label_encoder.fit_transform(data['country'])
data['distribution_channel'] = label_encoder.fit_transform(data['distribution_channel'])
data['market_segment'] = label_encoder.fit_transform(data['market_segment'])
data['reserved_room_type'] = label_encoder.fit_transform(data['reserved_room_type'])
data['reservation_status_date'] = label_encoder.fit_transform(data['reservation_status_date'])
#逻辑回归预测预定是否会被取消
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
X = data.drop(['previous_cancellations'], axis = 1)
y = data['is_canceled']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape,'\n')
print(y_train.shape,'\n')
(82315, 30) (82315,)
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
acc_lr = accuracy_score(y_test, y_pred_lr)
conf = confusion_matrix(y_test, y_pred_lr)
clf_report = classification_report(y_test, y_pred_lr)
print(f"Accuracy Score of Logistic Regression is : {acc_lr}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")
Accuracy Score of Logistic Regression is : 0.7327858496525584
Confusion Matrix :
[[10718 1742]
[ 3757 4362]]
Classification Report :
precision recall f1-score support
0 0.74 0.86 0.80 12460
1 0.71 0.54 0.61 8119
accuracy 0.73 20579
macro avg 0.73 0.70 0.70 20579
weighted avg 0.73 0.73 0.72 20579